NB: Set the argument output_image of the plotting functions to False when running the notebook to have interactive plots (some have a dropdown that allows to choose the province/region)
import pandas as pd
import numpy as np
import datetime
from scipy.integrate import odeint
import lmfit
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error
import os, sys
sys.path.insert(0, os.path.abspath('../src'))
from plots import *
from utils import *
from sird import *
data_path = "../data"
DataDownloader(data_path).download_all_csv()
covidpro_df, dpc_regioni_df, dpc_province_df, pop_prov_df, prov_list_df = load_data(data_path)
Stochastic and continuous with logistic Rt
province = 'Firenze'
sirsol = sird(province, pop_prov_df)
S, I, R, D = sirsol
times = list(range(sirsol.shape[1]))
general_plot(t=times,
data=sirsol,
title='SIRD ' + province,
traces_visibility=['legendonly'] + [True]*3,
output_image=True,
template='simple_white')
# Uncomment for interactive plots
#result = pd.DataFrame()
#for prov in covidpro_df.Province.unique():
# S, I, R, D = sird(prov, pop_prov_df)
# tmp = pd.DataFrame(np.column_stack([[prov]*len(S),range(len(S)),S,I,R,D]))
# result = pd.concat([result, tmp])
#
#result.columns = ["Province", "t", "S", "I", "R", "D"]
#result.reset_index(drop=True,inplace=True)
#inter_dropdown_plot(options=result.Province.unique(),
# default_value='Firenze',
# dropdown_label='Province',
# y=["S", "I", "R", "D"],
# legend_titles=['Susceptible', 'Infected', 'Recovered', 'Dead'],
# data=result,
# group_column='Province',
# x='t',
# title='COVID-19 trendlines of ',
# xtitle='Data',
# ytitle='Unità',
# traces_visibility=['legendonly',True,True,True],
# output_image=False)
names, title, data, modes = data_for_plot('Infected', covidpro_df, 'New_cases', I, province)
general_plot(t=times,
title=title,
data=data,
names=names,
modes=modes,
blend_legend=True,
output_image=True,
traces_visibility=['legendonly',True,True],
template='simple_white')
from sklearn.metrics import mean_absolute_error, mean_squared_error
mean_absolute_error(data[1], data[2]), mean_squared_error(data[1], data[2])
(10.684018260735181, 198.36702622348741)
names, title, data, modes = data_for_plot('Deaths', covidpro_df, 'Tot_deaths', D, province)
general_plot(t=times,
title=title,
data=data,
names=names,
modes=modes,
blend_legend=False,
output_image=True,
traces_visibility=[True,'legendonly',True],
template='simple_white')
mean_absolute_error(data[1], data[2]), mean_squared_error(data[1], data[2])
(139.55227138835534, 38606.3719889688)
result = pd.DataFrame()
for prov in covidpro_df.Province.unique():
_, I, _, _ = sird(prov, pop_prov_df)
names, _, data, _ = data_for_plot('Infected', covidpro_df, 'New_cases', I, prov)
tmp = pd.DataFrame(np.column_stack([[prov]*data[2].shape[0],range(data[2].shape[0]),data[0],data[1],data[2]]))
result = pd.concat([result, tmp])
result.columns = ["Province", "t", "real", "real_roll", "pred"]
result.reset_index(drop=True,inplace=True)
inter_dropdown_plot(options=result.Province.unique(),
default_value='Bologna',
dropdown_label='Province',
y=["real", "real_roll", "pred"],
legend_titles=names,
data=result,
group_column='Province',
x='t',
title='COVID-19 prediction of ',
xtitle='Time (days)',
ytitle='Number of individuals',
output_image=True,
traces_visibility=['legendonly',True,True],
template='simple_white')
mean_absolute_error(result['real_roll'], result['pred']), mean_squared_error(result['real_roll'], result['pred'])
(20.671333591115857, 1242.7771052357236)
SIRD model fitting using least squares
mapping = {
'New_cases': 2,
'Curr_pos_cases': 2,
'Tot_deaths': 4,
'Deaths': 4,
'Infected': 2
}
def fitter(x, R_0_start, k, x0, R_0_end, alpha, gamma):
ret = Model(days, N, R_0_start, k, x0, R_0_end, alpha, gamma)
return ret[mapping[compart]][x]
def get_model(province, compart, data_df, pop_df, params_init_min_max=None, query='20200603 > Date', outbreak_shift=20, window=7):
data = data_df[data_df.Province == province].query(query)[compart]
if compart in ['New_cases', 'Deaths']:
data = data.rolling(window).mean().fillna(0)
N = pop_df.loc[(pop_df.Territorio == province) & (pop_df.Eta == "Total")]['Value'].values[0]
# {parameter: (initial guess, min value, max value)}
if params_init_min_max == None:
params_init_min_max = {
"R_0_start": (3.5, 1.0, 6),
"k": (0.3, 0.01, 5.0),
"x0": (20, 0, 100),
"R_0_end": (0.9, 0.01, 3.5),
"alpha": (0.1, 0.00000001, 1),
"gamma": (1/7, 0.00000001, 1)
}
days = outbreak_shift + len(data)
if outbreak_shift >= 0:
y_data = np.concatenate((np.zeros(outbreak_shift), data))
else:
y_data = y_data[-outbreak_shift:]
# [0, 1, ..., days]
x_data = np.linspace(0, days-1, days, dtype=int)
mod = lmfit.Model(fitter)
for kwarg, (init, mini, maxi) in params_init_min_max.items():
mod.set_param_hint(str(kwarg), value=init, min=mini, max=maxi, vary=True)
params = mod.make_params()
return mod, params, y_data, x_data, days, N
No outbreak shift
province = 'Firenze'
compart = 'Curr_pos_cases'
mod, params, y_data, x_data, days, N = get_model(province, compart, covidpro_df, pop_prov_df, outbreak_shift=0)
time = datetime.datetime.now()
result = mod.fit(y_data, params, method="leastsq", x=x_data)
print('Fitting completed in {}'.format(datetime.datetime.now() - time))
Fitting completed in 0:00:00.784015
lmfit.report_fit(result)
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 315
# data points = 100
# variables = 6
chi-square = 104885.665
reduced chi-square = 1115.80494
Akaike info crit = 707.545594
Bayesian info crit = 723.176615
[[Variables]]
R_0_start: 4.00060097 +/- 15921.8921 (397987.51%) (init = 3.5)
k: 0.08401433 +/- 0.00626347 (7.46%) (init = 0.3)
x0: 18.4122033 +/- 2.33525570 (12.68%) (init = 20)
R_0_end: 1.32125873 +/- 5259.92217 (398099.33%) (init = 0.9)
alpha: 0.04520277 +/- 619.101855 (1369610.56%) (init = 0.1)
gamma: 0.14015531 +/- 557.884284 (398047.20%) (init = 0.1428571)
[[Correlations]] (unreported correlations are < 0.100)
C(R_0_end, alpha) = 1.000
C(R_0_start, gamma) = -1.000
C(R_0_start, R_0_end) = 1.000
C(R_0_end, gamma) = -1.000
C(R_0_start, alpha) = 1.000
C(alpha, gamma) = -1.000
C(k, x0) = 0.970
C(x0, alpha) = 0.135
C(x0, R_0_end) = 0.135
C(R_0_start, x0) = 0.135
C(x0, gamma) = -0.135
C(k, alpha) = 0.114
C(k, R_0_end) = 0.114
C(R_0_start, k) = 0.114
C(k, gamma) = -0.114
general_plot(t=x_data,
title='Model fitter: ' + compart + ' ' + province,
data=[result.data, result.best_fit],
names=['Real', 'Best fit'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True)
mean_absolute_error(result.data, result.best_fit), mean_squared_error(result.data, result.best_fit)
(21.052558582188144, 1048.8566477536028)
result.best_values
{'R_0_start': 4.000600973282221,
'k': 0.08401433194087307,
'x0': 18.41220330012675,
'R_0_end': 1.3212587303965662,
'alpha': 0.045202765880996774,
'gamma': 0.140155310496441}
sirsol = sird(province, pop_prov_df, **result.best_values)
S, I, R, D = sirsol
names, title, data, _ = data_for_plot('Cumulative infected', covidpro_df, 'Curr_pos_cases', I, province)
times = list(range(sirsol.shape[1]))
general_plot(t=times,
title=title,
data=data,
names=names,
modes=['markers', 'markers', 'lines'],
blend_legend=True,
output_image=True,
traces_visibility=['legendonly',True,True])
# Error here is shown on the rolling 7 days!
mean_absolute_error(data[1], data[2]), mean_squared_error(data[1], data[2])
(103.01746070219464, 19309.73755678003)
No outbreak shift
compart = 'New_cases'
mod, params, y_data, x_data, days, N = get_model(province, compart, covidpro_df, pop_prov_df, outbreak_shift=0)
time = datetime.datetime.now()
result = mod.fit(y_data, params, method="leastsq", x=x_data)
print('Fitting completed in {}'.format(datetime.datetime.now() - time))
Fitting completed in 0:00:01.010773
lmfit.report_fit(result)
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 467
# data points = 100
# variables = 6
chi-square = 5137.54834
reduced chi-square = 54.6547696
Akaike info crit = 405.916108
Bayesian info crit = 421.547129
[[Variables]]
R_0_start: 3.05288038 +/- 413.392448 (13541.06%) (init = 3.5)
k: 0.14421469 +/- 0.05549546 (38.48%) (init = 0.3)
x0: 35.1144714 +/- 2.75302411 (7.84%) (init = 20)
R_0_end: 1.75653544 +/- 62.2097625 (3541.62%) (init = 0.9)
alpha: 0.21732658 +/- 47.6931081 (21945.36%) (init = 0.1)
gamma: 0.16314855 +/- 46.7198532 (28636.39%) (init = 0.1428571)
[[Correlations]] (unreported correlations are < 0.100)
C(R_0_start, gamma) = -0.993
C(alpha, gamma) = 0.980
C(R_0_start, alpha) = -0.949
C(x0, alpha) = -0.934
C(x0, gamma) = -0.928
C(k, alpha) = 0.910
C(R_0_start, x0) = 0.906
C(k, gamma) = 0.896
C(R_0_start, k) = -0.870
C(k, x0) = -0.765
C(R_0_start, R_0_end) = 0.704
C(R_0_end, gamma) = -0.615
C(x0, R_0_end) = 0.471
C(R_0_end, alpha) = -0.445
C(k, R_0_end) = -0.421
general_plot(t=x_data,
title='Model fitter: ' + compart + ' ' + province,
data=[result.data, result.best_fit],
names=['Real (rolling 7 days)', 'Best fit'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True,
horiz_legend=True,
template='plotly_white')
mean_absolute_error(result.data, result.best_fit), mean_squared_error(result.data, result.best_fit)
(5.202194053266722, 51.37548343795631)
result.best_values
{'R_0_start': 3.052880378004642,
'k': 0.1442146900847878,
'x0': 35.11447143059068,
'R_0_end': 1.7565354397122144,
'alpha': 0.2173265751223985,
'gamma': 0.16314855326415773}
sirsol = sird(province, pop_prov_df, **result.best_values)
S, I, R, D = sirsol
names, title, data, _ = data_for_plot('Infected', covidpro_df, 'New_cases', I, province)
general_plot(t=times,
title=title,
data=data,
names=names,
modes=['markers', 'markers', 'lines'],
blend_legend=False,
output_image=True,
traces_visibility=['legendonly',True,True],
template='plotly_white')
mean_absolute_error(data[1], data[2]), mean_squared_error(data[1], data[2])
(5.174296292663379, 50.92311168845561)
names, title, data, _ = data_for_plot('Cumulative deaths', covidpro_df, 'Tot_deaths', D, province)
times = list(range(sirsol.shape[1]))
general_plot(t=times,
title=title,
data=data,
names=names,
modes=['markers', 'markers', 'lines'],
blend_legend=False,
output_image=True,
horiz_legend=True,
traces_visibility=[False,True,True],
template='plotly_white')
mean_absolute_error(data[1], data[2]), mean_squared_error(data[1], data[2])
(244.3075423180887, 85893.60453131823)
Outbreak shift = 10
province = 'Firenze'
compart = 'New_cases'
mod, params, y_data, x_data, days, N = get_model(province, compart, covidpro_df, pop_prov_df, outbreak_shift=10)
result = mod.fit(y_data, params, method="leastsq", x=x_data)
lmfit.report_fit(result)
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 1262
# data points = 110
# variables = 6
chi-square = 5002.23991
reduced chi-square = 48.0984607
Akaike info crit = 431.887678
Bayesian info crit = 448.090560
[[Variables]]
R_0_start: 4.05215671 +/- 1.31330311 (32.41%) (init = 3.5)
k: 2.93264960 +/- 17.4557225 (595.22%) (init = 0.3)
x0: 49.9246389 +/- 2.58447307 (5.18%) (init = 20)
R_0_end: 3.50000000 +/- 1.26515891 (36.15%) (init = 0.9)
alpha: 0.90631996 +/- 2.23885190 (247.03%) (init = 0.1)
gamma: 0.25301505 +/- 0.45485163 (179.77%) (init = 0.1428571)
[[Correlations]] (unreported correlations are < 0.100)
C(alpha, gamma) = 0.984
C(k, x0) = -0.929
C(x0, alpha) = 0.889
C(k, alpha) = -0.884
C(x0, R_0_end) = 0.828
C(k, gamma) = -0.812
C(x0, gamma) = 0.797
C(k, R_0_end) = -0.725
C(R_0_start, R_0_end) = 0.676
C(R_0_end, alpha) = 0.501
C(R_0_start, gamma) = -0.462
C(R_0_end, gamma) = 0.341
C(R_0_start, alpha) = -0.299
C(R_0_start, x0) = 0.157
general_plot(t=x_data,
title='Model fitter: ' + compart + ' ' + province,
data=[result.data, result.best_fit],
names=['Real (rolling 7 days)', 'Best fit'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True)
mean_absolute_error(result.data, result.best_fit), mean_squared_error(result.data, result.best_fit)
(5.1353624823786514, 45.474908307785185)
result.best_values
{'R_0_start': 4.052156711003933,
'k': 2.9326495956369127,
'x0': 49.92463894319742,
'R_0_end': 3.4999999963869994,
'alpha': 0.9063199642227926,
'gamma': 0.2530150538119541}
sirsol = sird(province, pop_prov_df, **result.best_values)
S, I, R, D = sirsol
names, title, data, _ = data_for_plot('Infected', covidpro_df, 'New_cases', I, province)
general_plot(t=times,
title=title,
data=data,
names=names,
modes=['markers', 'markers', 'lines'],
blend_legend=True,
output_image=True,
traces_visibility=['legendonly',True,True])
mean_absolute_error(data[1], data[2]), mean_squared_error(data[1], data[2])
(19.113807034585523, 613.9534468451312)
No outbreak shift
province = 'Bologna'
compart = 'New_cases'
mod, params, y_data, x_data, days, N = get_model(province, compart, covidpro_df, pop_prov_df, outbreak_shift=0)
result = mod.fit(y_data, params, method="leastsq", x=x_data)
lmfit.report_fit(result)
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 728
# data points = 100
# variables = 6
chi-square = 5177.09762
reduced chi-square = 55.0755066
Akaike info crit = 406.682969
Bayesian info crit = 422.313990
[[Variables]]
R_0_start: 3.85781475 +/- 0.77411065 (20.07%) (init = 3.5)
k: 0.75424466 +/- 0.26522806 (35.16%) (init = 0.3)
x0: 32.6953830 +/- 0.32227397 (0.99%) (init = 20)
R_0_end: 3.15554101 +/- 0.46050078 (14.59%) (init = 0.9)
alpha: 0.84599519 +/- 2.05883003 (243.36%) (init = 0.1)
gamma: 0.27053191 +/- 0.46207082 (170.80%) (init = 0.1428571)
[[Correlations]] (unreported correlations are < 0.100)
C(alpha, gamma) = 1.000
C(R_0_start, gamma) = -1.000
C(R_0_start, alpha) = -1.000
C(R_0_end, alpha) = 1.000
C(R_0_end, gamma) = 1.000
C(R_0_start, R_0_end) = -1.000
C(k, R_0_end) = 0.261
C(R_0_start, k) = -0.254
C(k, alpha) = 0.253
C(k, gamma) = 0.252
C(R_0_start, x0) = 0.144
C(x0, gamma) = -0.143
C(x0, alpha) = -0.142
C(x0, R_0_end) = -0.135
general_plot(t=x_data,
title='Model fitter: ' + compart + ' ' + province,
data=[result.data, result.best_fit],
names=['Real', 'Best fit'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True)
mean_absolute_error(result.data, result.best_fit), mean_squared_error(result.data, result.best_fit)
(5.688712301757504, 51.77097619740428)
sirsol = sird(province, pop_prov_df, **result.best_values)
S, I, R, D = sirsol
names, title, data, _ = data_for_plot('Infected', covidpro_df, 'New_cases', I, province)
times = list(range(sirsol.shape[1]))
general_plot(t=times,
title=title,
data=data,
names=names,
modes=['markers', 'markers', 'lines'],
blend_legend=True,
output_image=True,
traces_visibility=['legendonly',True,True])
mean_absolute_error(data[1], data[2]), mean_squared_error(data[1], data[2])
(5.671218728735603, 51.410679374153005)
province = 'Firenze'
compart = 'Tot_deaths'
mod, params, y_data, x_data, days, N = get_model(province, compart, covidpro_df, pop_prov_df, outbreak_shift=0)
result = mod.fit(y_data, params, method="leastsq", x=x_data)
lmfit.report_fit(result)
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 716
# data points = 100
# variables = 6
chi-square = 1835.09090
reduced chi-square = 19.5222436
Akaike info crit = 302.967911
Bayesian info crit = 318.598932
[[Variables]]
R_0_start: 2.97333634 +/- 3707.81606 (124702.21%) (init = 3.5)
k: 0.07053673 +/- 0.39938950 (566.21%) (init = 0.3)
x0: 40.9606482 +/- 58.0157590 (141.64%) (init = 20)
R_0_end: 0.32330740 +/- 1252.06758 (387268.46%) (init = 0.9)
alpha: 0.01152189 +/- 0.02590452 (224.83%) (init = 0.1)
gamma: 0.11878142 +/- 222.433658 (187263.00%) (init = 0.1428571)
[[Correlations]] (unreported correlations are < 0.100)
C(R_0_start, R_0_end) = -1.000
C(R_0_start, gamma) = -1.000
C(R_0_end, gamma) = 1.000
C(k, gamma) = -0.998
C(R_0_start, k) = 0.998
C(k, R_0_end) = -0.998
C(k, x0) = 0.998
C(x0, gamma) = -0.993
C(x0, R_0_end) = -0.993
C(R_0_start, x0) = 0.993
C(x0, alpha) = 0.953
C(k, alpha) = 0.931
C(alpha, gamma) = -0.911
C(R_0_end, alpha) = -0.911
C(R_0_start, alpha) = 0.911
general_plot(t=x_data,
title='Model fitter: ' + compart + ' ' + province,
data=[result.data, result.best_fit],
names=['Real', 'Best fit'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True)
mean_absolute_error(result.data, result.best_fit), mean_squared_error(result.data, result.best_fit)
(3.173061596258741, 18.350908956097907)
sirsol = sird(province, pop_prov_df, **result.best_values)
S, I, R, D = sirsol
names, title, data, _ = data_for_plot('Cumulative deaths', covidpro_df, 'Tot_deaths', D, province)
general_plot(t=times,
title=title,
data=data,
names=names,
modes=['markers', 'markers', 'lines'],
blend_legend=False,
output_image=True,
traces_visibility=['legendonly',True,True])
mean_absolute_error(data[1], data[2]), mean_squared_error(data[1], data[2])
(11.608808780074535, 239.59051866004114)
sirsol = sird(province, pop_prov_df, **result.best_values)
S, I, R, D = sirsol
names, title, data, _ = data_for_plot('Infected', covidpro_df, 'New_cases', I, province)
general_plot(t=times,
title=title,
data=data,
names=names,
modes=['markers', 'markers', 'lines'],
blend_legend=True,
output_image=True,
traces_visibility=['legendonly',True,True])
mean_absolute_error(data[1], data[2]), mean_squared_error(data[1], data[2])
(294.3665101336331, 159799.66154737442)
No outbreak shift
from lmfit.models import SkewedGaussianModel
province = 'Bologna'
compart = 'New_cases'
_, _, y_data, x_data, _, _ = get_model(province, compart, covidpro_df, pop_prov_df, outbreak_shift=0)
model = SkewedGaussianModel()
params = model.guess(y_data, x=x_data)
result = model.fit(y_data, params, x=x_data)
general_plot(t=x_data,
title='Model fitter: ' + compart + ' ' + province,
data=[result.data, result.best_fit],
names=['Real', 'Best fit'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True)
mean_absolute_error(result.data, result.best_fit), mean_squared_error(result.data, result.best_fit)
(6.034830571902737, 68.41045143333051)
print(lmfit.fit_report(result))
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 81
# data points = 100
# variables = 4
chi-square = 6841.04514
reduced chi-square = 71.2608869
Akaike info crit = 430.552561
Bayesian info crit = 440.973242
[[Variables]]
amplitude: 4980.53084 +/- 80.1918752 (1.61%) (init = 6872.143)
center: 25.8493059 +/- 0.30152642 (1.17%) (init = 42.56667)
sigma: 28.9202731 +/- 0.66437112 (2.30%) (init = 15)
gamma: 5.45079057 +/- 0.50824227 (9.32%) (init = 0)
[[Correlations]] (unreported correlations are < 0.100)
C(center, sigma) = -0.591
C(sigma, gamma) = 0.552
C(amplitude, sigma) = 0.522
C(center, gamma) = -0.377
C(amplitude, center) = -0.290
mapping_regional = {
'nuovi_positivi': 2,
'totale_positivi': 2,
'deceduti_giorni0': 4,
'deceduti': 4,
}
def fitter_regional(x, R_0_start, k, x0, R_0_end, alpha, gamma):
ret = Model(days, N, R_0_start, k, x0, R_0_end, alpha, gamma)
return ret[mapping_regional[compart]][x]
def get_model_regional(region, compart, data_df, pop_df, params_init_min_max=None, query='20200603 > data', outbreak_shift=20, window=7):
data = data_df[data_df.denominazione_regione == region].query(query)[compart]
if compart in ['nuovi_positivi', 'deceduti_giorno']:
data = data.rolling(window).mean().fillna(0)
N = get_region_pop(region, pop_df, prov_list_df)
# {parameter: (initial guess, min value, max value)}
if params_init_min_max == None:
params_init_min_max = {
"R_0_start": (3.5, 1.0, 6),
"k": (0.3, 0.01, 5.0),
"x0": (20, 0, 100),
"R_0_end": (0.9, 0.01, 3.5),
"alpha": (0.1, 0.00000001, 1),
"gamma": (1/7, 0.00000001, 1)
}
days = outbreak_shift + len(data)
if outbreak_shift >= 0:
y_data = np.concatenate((np.zeros(outbreak_shift), data))
else:
y_data = y_data[-outbreak_shift:]
# [0, 1, ..., days]
x_data = np.linspace(0, days-1, days, dtype=int)
mod = lmfit.Model(fitter_regional)
for kwarg, (init, mini, maxi) in params_init_min_max.items():
mod.set_param_hint(str(kwarg), value=init, min=mini, max=maxi, vary=True)
params = mod.make_params()
return mod, params, y_data, x_data, days, N
province = 'Piemonte'
compart = 'totale_positivi'
mod, params, y_data, x_data, days, N = get_model_regional(province, compart, dpc_regioni_df, pop_prov_df, outbreak_shift=0)
time = datetime.datetime.now()
result = mod.fit(y_data, params, method="leastsq", x=x_data)
print('Fitting done in {}'.format(datetime.datetime.now() - time))
Fitting done in 0:00:02.065748
lmfit.report_fit(result)
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 796
# data points = 100
# variables = 6
chi-square = 7653709.68
reduced chi-square = 81422.4434
Akaike info crit = 1136.55308
Bayesian info crit = 1152.18410
[[Variables]]
R_0_start: 2.81509466 +/- 4425.94420 (157221.86%) (init = 3.5)
k: 0.40670848 +/- 0.07517720 (18.48%) (init = 0.3)
x0: 23.2394664 +/- 0.84675312 (3.64%) (init = 20)
R_0_end: 2.05528883 +/- 3231.82136 (157244.15%) (init = 0.9)
alpha: 0.57836314 +/- 424.531616 (73402.26%) (init = 0.1)
gamma: 0.39056083 +/- 613.822884 (157164.48%) (init = 0.1428571)
[[Correlations]] (unreported correlations are < 0.100)
C(R_0_end, gamma) = -1.000
C(R_0_start, gamma) = -1.000
C(R_0_start, R_0_end) = 1.000
C(R_0_end, alpha) = 1.000
C(alpha, gamma) = -1.000
C(R_0_start, alpha) = 1.000
C(k, x0) = 0.882
general_plot(t=x_data,
title='Model fitter: ' + compart + ' ' + province,
data=[result.data, result.best_fit],
names=['Real', 'Best fit'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True)
mean_absolute_error(result.data, result.best_fit), mean_squared_error(result.data, result.best_fit)
(220.20614105784284, 76537.0967817327)
result.best_values
{'R_0_start': 2.8150946636121557,
'k': 0.40670848072457977,
'x0': 23.23946640539682,
'R_0_end': 2.055288830367861,
'alpha': 0.5783631359463409,
'gamma': 0.39056082816987375}
Discrete parameter estimation using LinearRegression.
Compute parameters for each day and then predict new days
regione = 'Piemonte'
pop = get_region_pop(regione, pop_prov_df, prov_list_df)
filtered_df = dpc_regioni_df[dpc_regioni_df.denominazione_regione == regione][['data', 'totale_positivi', 'dimessi_guariti', 'deceduti', 'totale_casi', 'nuovi_positivi']]
filtered_df = filtered_df.query('20200630 > data')
filtered_df['suscettibili'] = pop - filtered_df['totale_casi']
filtered_df = filtered_df[['data', 'totale_positivi', 'dimessi_guariti', 'deceduti', 'suscettibili', 'nuovi_positivi']]
n = filtered_df.shape[0]
gamma = np.diff(filtered_df['dimessi_guariti'].values)/filtered_df.iloc[:n-1]['totale_positivi'].values
alpha = np.diff(filtered_df['deceduti'].values)/filtered_df.iloc[:n-1]['totale_positivi'].values
beta = (pop/filtered_df.iloc[:n-1]['suscettibili'].values)*(np.diff(filtered_df['totale_positivi'].values)+np.diff(filtered_df['dimessi_guariti'].values)+np.diff(filtered_df['deceduti'].values))/filtered_df.iloc[:n-1]['totale_positivi'].values
R0 = beta/(gamma+alpha)
def fix_arr(arr):
arr[arr < 0] = 0
arr[np.isinf(arr)] = 0
return np.nan_to_num(arr)
gamma = fix_arr(gamma)
alpha = fix_arr(alpha)
beta = fix_arr(beta)
R0 = fix_arr(R0)
def lag_data(data, lag=5, return_all=False):
N = len(data)
X = np.empty(shape=(N-lag, lag+1))
for i in range(lag, N):
X[i-lag,] = [data[i-j] for j in range(lag+1)]
if not return_all:
return X[-1,1:]
else:
return X[:,1:], X[:,0]
from sklearn.linear_model import LinearRegression
lag = 7
reg_beta = LinearRegression().fit(*lag_data(beta, lag, True))
reg_gamma = LinearRegression().fit(*lag_data(gamma, lag, True))
reg_alpha = LinearRegression().fit(*lag_data(alpha, lag, True))
days_to_predict = 14
S = np.zeros(days_to_predict + 2)
I = np.zeros(days_to_predict + 2)
R = np.zeros(days_to_predict + 2)
D = np.zeros(days_to_predict + 2)
S[0] = filtered_df.iloc[-1]['suscettibili']
I[0] = filtered_df.iloc[-1]['totale_positivi']
R[0] = filtered_df.iloc[-1]['dimessi_guariti']
D[0] = filtered_df.iloc[-1]['deceduti']
for i in range(days_to_predict + 1):
_beta = fix_arr(reg_beta.predict(lag_data(beta, lag).reshape(1, -1)))
_gamma = fix_arr(reg_gamma.predict(lag_data(gamma, lag).reshape(1, -1)))
_alpha = fix_arr(reg_alpha.predict(lag_data(alpha, lag).reshape(1, -1)))
beta = np.append(beta, _beta, axis = 0)
gamma = np.append(gamma, _gamma, axis = 0)
alpha = np.append(alpha, _alpha, axis = 0)
dIdt = np.round((1 + _beta * (S[i]/pop) - _gamma - _alpha)*I[i])
dRdt = np.round(R[i] + _gamma * I[i])
dDdt = np.round(D[i] + _alpha * I[i])
dSdt = pop-dIdt[0]-dRdt[0]-dDdt[0]
S[i+1] = dSdt
I[i+1] = dIdt
R[i+1] = dRdt
D[i+1] = dDdt
S = S[1:]
I = I[1:]
R = R[1:]
D = D[1:]
dates = pd.date_range(start=(filtered_df.iloc[-1]['data'] + pd.DateOffset(1)).strftime('%Y-%m-%d'), periods=days_to_predict + 1)
tmp_df = pd.DataFrame(np.column_stack([np.zeros(days_to_predict + 1), I, R, D, S]),
columns = [
'data',
'totale_positivi',
'dimessi_guariti',
'deceduti',
'suscettibili'])
tmp_df['data'] = dates
filtered_df = pd.concat([filtered_df, tmp_df], ignore_index=True)
filtered_df['nuovi_positivi'] = [0]+list(np.diff(filtered_df['totale_positivi'].values)+np.diff(filtered_df['dimessi_guariti'].values)+np.diff(filtered_df['deceduti'].values))
filtered_df['nuovi_positivi'] = filtered_df['nuovi_positivi'].apply(lambda x: 0 if x < 0 else x)
beta = np.append(beta, np.zeros((1,)), axis = 0)
gamma = np.append(gamma, np.zeros((1,)), axis = 0)
alpha = np.append(alpha, np.zeros((1,)), axis = 0)
filtered_df['beta'] = beta
filtered_df['gamma'] = gamma
filtered_df['alpha'] = alpha
filtered_df['R0'] = fix_arr(beta/(gamma+alpha))
filtered_df = filtered_df[:-1]
filtered_df = filtered_df.astype({'totale_positivi': 'int32',
'dimessi_guariti': 'int32', 'deceduti': 'int32', 'suscettibili': 'int32', 'nuovi_positivi': 'int32'})
filtered_df
| data | totale_positivi | dimessi_guariti | deceduti | suscettibili | nuovi_positivi | beta | gamma | alpha | R0 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-02-24 | 3 | 0 | 0 | 4341372 | 0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1 | 2020-02-25 | 3 | 0 | 0 | 4341372 | 0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 2020-02-26 | 3 | 0 | 0 | 4341372 | 0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 2020-02-27 | 2 | 0 | 0 | 4341373 | 0 | 4.500002 | 0.000000 | 0.000000 | 0.000000 |
| 4 | 2020-02-28 | 11 | 0 | 0 | 4341364 | 9 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 136 | 2020-07-09 | 1262 | 26251 | 4135 | 4309727 | 35 | 0.027141 | 0.030085 | 0.004501 | 0.784739 |
| 137 | 2020-07-10 | 1252 | 26289 | 4141 | 4309693 | 34 | 0.028250 | 0.034143 | 0.004219 | 0.736410 |
| 138 | 2020-07-11 | 1239 | 26332 | 4146 | 4309658 | 35 | 0.030434 | 0.045073 | 0.004440 | 0.614664 |
| 139 | 2020-07-12 | 1215 | 26388 | 4152 | 4309620 | 38 | 0.029912 | 0.039143 | 0.004685 | 0.682485 |
| 140 | 2020-07-13 | 1198 | 26436 | 4158 | 4309583 | 37 | 0.029172 | 0.034160 | 0.004504 | 0.754512 |
141 rows × 10 columns
real_df = dpc_regioni_df[dpc_regioni_df.denominazione_regione == regione][['data', 'totale_positivi', 'dimessi_guariti', 'deceduti', 'totale_casi', 'nuovi_positivi']]
real_df = real_df.query('20200714 > data')
real_df['suscettibili'] = pop - real_df['totale_casi']
real_df = real_df[['data', 'totale_positivi', 'dimessi_guariti', 'deceduti', 'suscettibili', 'nuovi_positivi']]
real_df
| data | totale_positivi | dimessi_guariti | deceduti | suscettibili | nuovi_positivi | |
|---|---|---|---|---|---|---|
| 13 | 2020-02-24 | 3 | 0 | 0 | 4341372 | 3 |
| 34 | 2020-02-25 | 3 | 0 | 0 | 4341372 | 0 |
| 55 | 2020-02-26 | 3 | 0 | 0 | 4341372 | 0 |
| 76 | 2020-02-27 | 2 | 0 | 0 | 4341373 | -1 |
| 97 | 2020-02-28 | 11 | 0 | 0 | 4341364 | 9 |
| ... | ... | ... | ... | ... | ... | ... |
| 2869 | 2020-07-09 | 1124 | 26243 | 4108 | 4309900 | 16 |
| 2890 | 2020-07-10 | 1075 | 26300 | 4110 | 4309890 | 10 |
| 2911 | 2020-07-11 | 1069 | 26314 | 4111 | 4309881 | 9 |
| 2932 | 2020-07-12 | 1058 | 26329 | 4111 | 4309877 | 4 |
| 2953 | 2020-07-13 | 1053 | 26339 | 4112 | 4309871 | 6 |
141 rows × 6 columns
# Run if we want to show only the real predicted points
# in the graph and not the entire line
#filtered_df['nuovi_positivi'] = [None]*(filtered_df.shape[0]-days_to_predict) + list(filtered_df.iloc[filtered_df.shape[0]-days_to_predict:]['nuovi_positivi'])
#filtered_df['deceduti'] = [None]*(filtered_df.shape[0]-days_to_predict) + list(filtered_df.iloc[filtered_df.shape[0]-days_to_predict:]['deceduti'])
#filtered_df['dimessi_guariti'] = [None]*(filtered_df.shape[0]-days_to_predict) + list(filtered_df.iloc[filtered_df.shape[0]-days_to_predict:]['dimessi_guariti'])
#filtered_df['totale_positivi'] = [None]*(filtered_df.shape[0]-days_to_predict) + list(filtered_df.iloc[filtered_df.shape[0]-days_to_predict:]['totale_positivi'])
general_plot(t=real_df['data'],
title='SIRD infected of ' + regione,
data=[real_df['nuovi_positivi'].values, filtered_df['nuovi_positivi'].values],
names=['Real', 'Prediction'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True)
mean_absolute_error(real_df['nuovi_positivi'].values, filtered_df['nuovi_positivi'].values), mean_squared_error(real_df['nuovi_positivi'].values, filtered_df['nuovi_positivi'].values)
(2.141843971631206, 48.09929078014184)
general_plot(t=real_df['data'],
title='SIRD deaths of ' + regione,
data=[real_df['deceduti'].values, filtered_df['deceduti'].values],
names=['Real', 'Prediction'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True)
mean_absolute_error(real_df['deceduti'].values, filtered_df['deceduti'].values), mean_squared_error(real_df['deceduti'].values, filtered_df['deceduti'].values)
(1.9361702127659575, 57.51063829787234)
general_plot(t=real_df['data'],
title='SIRD recovered of ' + regione,
data=[real_df['dimessi_guariti'].values, filtered_df['dimessi_guariti'].values],
names=['Real', 'Prediction'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True)
mean_absolute_error(real_df['dimessi_guariti'].values, filtered_df['dimessi_guariti'].values), mean_squared_error(real_df['dimessi_guariti'].values, filtered_df['dimessi_guariti'].values)
(3.8652482269503547, 244.9290780141844)
general_plot(t=real_df['data'],
title='SIRD cumulated infected of ' + regione,
data=[real_df['totale_positivi'].values, filtered_df['totale_positivi'].values],
names=['Real', 'Prediction'],
modes=['markers', 'lines'],
blend_legend=False,
output_image=True)
mean_absolute_error(real_df['totale_positivi'].values, filtered_df['totale_positivi'].values), mean_squared_error(real_df['totale_positivi'].values, filtered_df['totale_positivi'].values)
(12.28368794326241, 1627.404255319149)
days_to_predict = 14
model = DeterministicSird(
data_df=covidpro_df,
pop_prov_df=pop_prov_df,
prov_list_df=prov_list_df,
area="Firenze",
group_column='Province',
data_column='Date',
data_filter="20200630",
lag=11,
days_to_predict=days_to_predict,
is_regional=False,
pcm_data=dpc_regioni_df)
res = model.fit()
real_df = model.real_df
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
compart = "nuovi_positivi"
fig, ax = plt.subplots(figsize=(8,5))
locator = mdates.AutoDateLocator(minticks=3)
formatter = mdates.ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)
ax.plot(res.data, res[compart].values, label='Predicted')
ax.scatter(real_df.data, real_df[compart].values, label='Real', s=3, color="black")
ax.axvline(real_df.iloc[-days_to_predict].data, linestyle="dashed", color="grey", alpha=0.3)
plt.ylabel("Number of individuals")
plt.legend()
ax_new = fig.add_axes([0.63, 0.6, 0.25, 0.25])
ax_new.plot(res.iloc[-days_to_predict:].data, res.iloc[-days_to_predict:][compart].values, label='Predicted')
ax_new.scatter(real_df.iloc[-days_to_predict:].data, real_df.iloc[-days_to_predict:][compart].values, label='Real', s=3, color="black")
locator = mdates.AutoDateLocator(minticks=2, maxticks=5)
formatter = mdates.ConciseDateFormatter(locator)
ax_new.xaxis.set_major_locator(locator)
ax_new.xaxis.set_major_formatter(formatter)
plt.show()
compart = "totale_positivi"
fig, ax = plt.subplots(figsize=(8,5))
locator = mdates.AutoDateLocator(minticks=3)
formatter = mdates.ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)
ax.plot(res.data, res[compart].values, label='Predicted')
ax.scatter(real_df.data, real_df[compart].values, label='Real', s=3, color="black")
ax.axvline(real_df.iloc[-days_to_predict].data, linestyle="dashed", color="grey", alpha=0.3)
plt.ylabel("Number of individuals")
plt.legend()
ax_new = fig.add_axes([0.63, 0.6, 0.25, 0.25])
ax_new.plot(res.iloc[-days_to_predict:].data, res.iloc[-days_to_predict:][compart].values, label='Predicted')
ax_new.scatter(real_df.iloc[-days_to_predict:].data, real_df.iloc[-days_to_predict:][compart].values, label='Real', s=3, color="black")
locator = mdates.AutoDateLocator(minticks=2, maxticks=5)
formatter = mdates.ConciseDateFormatter(locator)
ax_new.xaxis.set_major_locator(locator)
ax_new.xaxis.set_major_formatter(formatter)
plt.show()
compart = "deceduti"
fig, ax = plt.subplots(figsize=(8,5))
locator = mdates.AutoDateLocator(minticks=3)
formatter = mdates.ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)
ax.plot(res.data, res[compart].values, label='Predicted')
ax.scatter(real_df.data, real_df[compart].values, label='Real', s=3, color="black")
ax.axvline(real_df.iloc[-days_to_predict].data, linestyle="dashed", color="grey", alpha=0.3)
plt.ylabel("Number of individuals")
plt.legend()
ax_new = fig.add_axes([0.63, 0.25, 0.25, 0.25])
ax_new.plot(res.iloc[-days_to_predict:].data, res.iloc[-days_to_predict:][compart].values, label='Predicted')
ax_new.scatter(real_df.iloc[-days_to_predict:].data, real_df.iloc[-days_to_predict:][compart].values, label='Real', s=3, color="black")
locator = mdates.AutoDateLocator(minticks=2, maxticks=5)
formatter = mdates.ConciseDateFormatter(locator)
ax_new.xaxis.set_major_locator(locator)
ax_new.xaxis.set_major_formatter(formatter)
plt.show()
from tqdm import tqdm
lags = range(5,15)
days_range = range(5,31)
data_filter = '20200630'
tot_len = len(dpc_regioni_df.denominazione_regione.unique()) * len(lags) * len(days_range)
pbar = tqdm(total=tot_len)
results = []
for regione in dpc_regioni_df.denominazione_regione.unique():
for l in lags:
for d in days_range:
model = DeterministicSird(
data_df=dpc_regioni_df,
pop_prov_df=pop_prov_df,
prov_list_df=prov_list_df,
area=regione,
group_column='denominazione_regione',
data_column='data',
data_filter=data_filter,
lag=l,
days_to_predict=d)
res = model.fit()
realdf = model.real_df
if res.shape[0] != realdf.shape[0]:
print(regione, l, d, res.shape[0], realdf.shape[0])
mae_tot_pos = model.mae(compart='totale_positivi')
mse_tot_pos = model.mse(compart='totale_positivi')
mae_deaths = model.mae(compart='deceduti')
mse_deaths = model.mse(compart='deceduti')
mae_rec = model.mae(compart='dimessi_guariti')
mse_rec = model.mse(compart='dimessi_guariti')
mae_avg = np.mean([mae_tot_pos, mae_deaths, mae_rec])
mse_avg = np.mean([mse_tot_pos, mse_deaths, mse_rec])
results.append([
regione,
l,
d,
data_filter,
mae_avg,
mse_avg,
mae_tot_pos,
mse_tot_pos,
mae_deaths,
mse_deaths,
mae_rec,
mse_rec])
pbar.update(1)
pbar.close()
100%|██████████| 3200/3200 [04:03<00:00, 13.14it/s]
results_df = pd.DataFrame.from_records(results, columns=[
'region',
'lag',
'days_to_predict',
'train_last_date',
'avg_mae',
'avg_mse',
'mae_tot_pos',
'mse_tot_pos',
'mae_deaths',
'mse_deaths',
'mae_recovered',
'mse_recovered'])
results_df
| region | lag | days_to_predict | train_last_date | avg_mae | avg_mse | mae_tot_pos | mse_tot_pos | mae_deaths | mse_deaths | mae_recovered | mse_recovered | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Abruzzo | 5 | 5 | 20200630 | 2.118687 | 177.093434 | 3.431818 | 312.780303 | 0.053030 | 0.113636 | 2.871212 | 218.386364 |
| 1 | Abruzzo | 5 | 6 | 20200630 | 2.483709 | 204.097744 | 4.060150 | 367.338346 | 0.082707 | 0.233083 | 3.308271 | 244.721805 |
| 2 | Abruzzo | 5 | 7 | 20200630 | 2.855721 | 232.985075 | 4.731343 | 430.537313 | 0.119403 | 0.417910 | 3.716418 | 268.000000 |
| 3 | Abruzzo | 5 | 8 | 20200630 | 3.224691 | 262.483951 | 5.429630 | 499.948148 | 0.162963 | 0.681481 | 4.081481 | 286.822222 |
| 4 | Abruzzo | 5 | 9 | 20200630 | 3.615196 | 296.678922 | 6.191176 | 583.632353 | 0.213235 | 1.036765 | 4.441176 | 305.367647 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3035 | Veneto | 14 | 26 | 20200630 | 13.880174 | 3430.206972 | 29.261438 | 9488.176471 | 3.117647 | 64.895425 | 9.261438 | 737.549020 |
| 3036 | Veneto | 14 | 27 | 20200630 | 15.214286 | 4005.088745 | 32.389610 | 11122.155844 | 3.285714 | 69.935065 | 9.967532 | 823.175325 |
| 3037 | Veneto | 14 | 28 | 20200630 | 16.587097 | 4622.655914 | 35.612903 | 12876.361290 | 3.445161 | 74.541935 | 10.703226 | 917.064516 |
| 3038 | Veneto | 14 | 29 | 20200630 | 18.017094 | 5287.017094 | 38.929487 | 14754.134615 | 3.628205 | 80.628205 | 11.493590 | 1026.288462 |
| 3039 | Veneto | 14 | 30 | 20200630 | 19.513800 | 6035.072187 | 42.445860 | 16884.878981 | 3.828025 | 87.917197 | 12.267516 | 1132.420382 |
5200 rows × 12 columns
results_df.replace(0, np.nan, inplace=True)
results_df.groupby('region').agg(['min'])
| lag | days_to_predict | avg_mae | avg_mse | mae_tot_pos | mse_tot_pos | mae_deaths | mse_deaths | mae_recovered | mse_recovered | |
|---|---|---|---|---|---|---|---|---|---|---|
| min | min | min | min | min | min | min | min | min | min | |
| region | ||||||||||
| Abruzzo | 5 | 5 | 1.936869 | 146.967172 | 2.772727 | 204.636364 | 0.053030 | 0.113636 | 2.871212 | 218.386364 |
| Basilicata | 5 | 5 | 0.017677 | 0.017677 | 0.030303 | 0.030303 | 0.006803 | 0.006803 | 0.022727 | 0.022727 |
| Calabria | 5 | 5 | 0.025253 | 0.045455 | 0.007463 | 0.007463 | NaN | NaN | 0.045455 | 0.090909 |
| Campania | 5 | 5 | 0.320707 | 6.118687 | 0.818182 | 17.893939 | 0.037879 | 0.037879 | 0.083333 | 0.265152 |
| Emilia-Romagna | 5 | 5 | 0.828283 | 36.025253 | 1.219697 | 69.871212 | 0.166667 | 0.772727 | 1.015152 | 36.515152 |
| Friuli Venezia Giulia | 5 | 5 | 0.714646 | 22.876263 | 1.174242 | 41.734848 | NaN | NaN | 0.954545 | 26.212121 |
| Lazio | 5 | 5 | 0.434343 | 12.661616 | 0.560606 | 21.787879 | 0.014925 | 0.014925 | 0.704545 | 16.143939 |
| Liguria | 5 | 5 | 0.179293 | 1.267677 | 0.075758 | 0.257576 | 0.151515 | 0.742424 | 0.204545 | 1.386364 |
| Lombardia | 5 | 5 | 7.515152 | 2542.101010 | 9.984848 | 3265.666667 | 0.166667 | 1.015152 | 12.295455 | 4326.909774 |
| Marche | 5 | 5 | 0.797980 | 24.838384 | 1.151515 | 38.227273 | 0.121212 | 0.484848 | 1.106061 | 34.818182 |
| Molise | 5 | 5 | 0.030303 | 0.060606 | 0.044776 | 0.089552 | NaN | NaN | 0.030303 | 0.030303 |
| Piemonte | 5 | 5 | 1.401515 | 76.648990 | 1.780303 | 86.053030 | 0.090909 | 0.257576 | 2.257576 | 137.852941 |
| Puglia | 5 | 5 | 0.191919 | 1.707071 | 0.318182 | 3.212121 | 0.022388 | 0.037313 | 0.234848 | 1.871212 |
| Sardegna | 5 | 5 | 0.035354 | 0.065657 | 0.037594 | 0.082707 | 0.037879 | 0.037879 | 0.030303 | 0.075758 |
| Sicilia | 5 | 5 | 0.093434 | 0.345960 | 0.136364 | 0.560606 | 0.037879 | 0.037879 | 0.106061 | 0.439394 |
| Toscana | 5 | 5 | 0.116162 | 0.540404 | 0.060606 | 0.136364 | 0.083333 | 0.234848 | 0.174242 | 0.946970 |
| Trentino Alto Adige | 5 | 5 | 0.694444 | 21.512626 | 1.121212 | 35.742424 | 0.006667 | 0.006667 | 0.962121 | 28.795455 |
| Umbria | 5 | 5 | 0.042929 | 0.154040 | 0.037313 | 0.052239 | 0.030303 | 0.030303 | 0.083333 | 0.295455 |
| Valle d'Aosta | 5 | 5 | 0.017677 | 0.017677 | 0.022727 | 0.022727 | 0.022727 | 0.022727 | 0.028986 | 0.028986 |
| Veneto | 5 | 5 | 0.542929 | 11.260101 | 0.590909 | 13.015152 | 0.310606 | 2.901515 | 0.681818 | 16.858209 |
results_df.to_csv("../results/sirddet_region.csv", index=False)
from tqdm import tqdm
lags = range(5,15)
days_range = range(15,31)
data_filter = '20200630'
tot_len = len(covidpro_df.Province.unique()) * len(lags) * len(days_range)
pbar = tqdm(total=tot_len)
results = []
for prov in covidpro_df.Province.unique():
for l in lags:
for d in days_range:
model = DeterministicSird(
data_df=covidpro_df,
pop_prov_df=pop_prov_df,
prov_list_df=prov_list_df,
area=prov,
group_column='Province',
data_column='Date',
data_filter=data_filter,
lag=l,
days_to_predict=d,
is_regional=False,
pcm_data=dpc_regioni_df)
try:
res = model.fit()
realdf = model.real_df
if res.shape[0] != realdf.shape[0]:
print(prov, l, d, res.shape[0], realdf.shape[0])
mae_tot_pos = model.mae(compart='totale_positivi')
mse_tot_pos = model.mse(compart='totale_positivi')
mae_deaths = model.mae(compart='deceduti')
mse_deaths = model.mse(compart='deceduti')
mae_rec = model.mae(compart='dimessi_guariti')
mse_rec = model.mse(compart='dimessi_guariti')
mae_avg = np.mean([mae_tot_pos, mae_deaths, mae_rec])
mse_avg = np.mean([mse_tot_pos, mse_deaths, mse_rec])
results.append([
prov,
l,
d,
data_filter,
mae_avg,
mse_avg,
mae_tot_pos,
mse_tot_pos,
mae_deaths,
mse_deaths,
mae_rec,
mse_rec])
except Exception as e:
print(prov, l, d)
print(e)
results.append([
prov,
l,
d,
data_filter,
0,
0,
0,
0,
0,
0,
0,
0])
pbar.update(1)
pbar.close()
21%|██▏ | 3106/14560 [03:54<14:20, 13.32it/s]Chieti 8 30 156 156 Cannot convert non-finite values (NA or inf) to integer 64%|██████▍ | 9389/14560 [12:14<06:33, 13.13it/s]Pistoia 11 26 152 152 Cannot convert non-finite values (NA or inf) to integer Pistoia 11 27 152 152 Cannot convert non-finite values (NA or inf) to integer Pistoia 11 28 152 152 Cannot convert non-finite values (NA or inf) to integer 65%|██████▍ | 9393/14560 [12:15<06:47, 12.69it/s]Pistoia 11 29 152 152 Cannot convert non-finite values (NA or inf) to integer Pistoia 11 30 152 152 Cannot convert non-finite values (NA or inf) to integer 65%|██████▍ | 9405/14560 [12:16<06:38, 12.92it/s]Pistoia 12 25 151 151 Cannot convert non-finite values (NA or inf) to integer Pistoia 12 26 151 151 Cannot convert non-finite values (NA or inf) to integer Pistoia 12 27 151 151 Cannot convert non-finite values (NA or inf) to integer 65%|██████▍ | 9407/14560 [12:16<07:08, 12.01it/s]Pistoia 12 28 151 151 Cannot convert non-finite values (NA or inf) to integer Pistoia 12 29 151 151 Cannot convert non-finite values (NA or inf) to integer 65%|██████▍ | 9409/14560 [12:16<07:14, 11.85it/s]Pistoia 12 30 151 151 Cannot convert non-finite values (NA or inf) to integer 65%|██████▍ | 9423/14560 [12:17<06:59, 12.24it/s]Pistoia 13 27 153 153 Cannot convert non-finite values (NA or inf) to integer Pistoia 13 28 153 153 Cannot convert non-finite values (NA or inf) to integer Pistoia 13 29 153 153 Cannot convert non-finite values (NA or inf) to integer 65%|██████▍ | 9425/14560 [12:17<07:06, 12.04it/s]Pistoia 13 30 153 153 Cannot convert non-finite values (NA or inf) to integer 65%|██████▍ | 9441/14560 [12:19<06:55, 12.33it/s]Pistoia 14 29 155 155 Cannot convert non-finite values (NA or inf) to integer Pistoia 14 30 155 155 Cannot convert non-finite values (NA or inf) to integer 77%|███████▋ | 11150/14560 [14:41<04:41, 12.12it/s]Rovigo 11 27 153 153 Cannot convert non-finite values (NA or inf) to integer Rovigo 11 28 153 153 Cannot convert non-finite values (NA or inf) to integer Rovigo 11 29 153 153 Cannot convert non-finite values (NA or inf) to integer 77%|███████▋ | 11154/14560 [14:42<04:27, 12.75it/s]Rovigo 11 30 153 153 Cannot convert non-finite values (NA or inf) to integer 77%|███████▋ | 11182/14560 [14:44<07:43, 7.29it/s]Rovigo 13 27 153 153 Cannot convert non-finite values (NA or inf) to integer Rovigo 13 28 153 153 Cannot convert non-finite values (NA or inf) to integer 77%|███████▋ | 11184/14560 [14:44<07:40, 7.33it/s]Rovigo 13 29 153 153 Cannot convert non-finite values (NA or inf) to integer Rovigo 13 30 153 153 Cannot convert non-finite values (NA or inf) to integer 77%|███████▋ | 11198/14560 [14:46<05:21, 10.44it/s]Rovigo 14 27 153 153 Cannot convert non-finite values (NA or inf) to integer Rovigo 14 28 153 153 Cannot convert non-finite values (NA or inf) to integer Rovigo 14 29 153 153 Cannot convert non-finite values (NA or inf) to integer 77%|███████▋ | 11202/14560 [14:46<04:58, 11.27it/s]Rovigo 14 30 153 153 Cannot convert non-finite values (NA or inf) to integer 100%|██████████| 14560/14560 [19:11<00:00, 12.65it/s]
results_df = pd.DataFrame.from_records(results, columns=[
'province',
'lag',
'days_to_predict',
'train_last_date',
'avg_mae',
'avg_mse',
'mae_tot_pos',
'mse_tot_pos',
'mae_deaths',
'mse_deaths',
'mae_recovered',
'mse_recovered'])
results_df
| province | lag | days_to_predict | train_last_date | avg_mae | avg_mse | mae_tot_pos | mse_tot_pos | mae_deaths | mse_deaths | mae_recovered | mse_recovered | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Agrigento | 5 | 5 | 20200630 | 0.313131 | 0.225770 | 0.480507 | 0.358012 | NaN | NaN | 0.458887 | 0.319299 |
| 1 | Agrigento | 5 | 6 | 20200630 | 0.315789 | 0.230153 | 0.487883 | 0.371381 | NaN | NaN | 0.459485 | 0.319079 |
| 2 | Agrigento | 5 | 7 | 20200630 | 0.320896 | 0.239721 | 0.496448 | 0.388572 | NaN | NaN | 0.466239 | 0.330591 |
| 3 | Agrigento | 5 | 8 | 20200630 | 0.325926 | 0.249206 | 0.505169 | 0.406446 | NaN | NaN | 0.472609 | 0.341171 |
| 4 | Agrigento | 5 | 9 | 20200630 | 0.330882 | 0.259049 | 0.509814 | 0.412961 | NaN | NaN | 0.482833 | 0.364185 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14555 | Viterbo | 14 | 26 | 20200630 | 5.775599 | 419.457370 | 8.629768 | 658.381501 | NaN | NaN | 8.697030 | 599.990609 |
| 14556 | Viterbo | 14 | 27 | 20200630 | 6.292208 | 487.807799 | 9.442838 | 770.429821 | NaN | NaN | 9.433786 | 692.993576 |
| 14557 | Viterbo | 14 | 28 | 20200630 | 6.862366 | 573.014560 | 10.423526 | 933.626700 | NaN | NaN | 10.163571 | 785.416980 |
| 14558 | Viterbo | 14 | 29 | 20200630 | 7.457265 | 665.311084 | 11.381244 | 1091.390949 | NaN | NaN | 10.990551 | 904.542304 |
| 14559 | Viterbo | 14 | 30 | 20200630 | 8.104034 | 776.884049 | 12.498323 | 1306.606801 | NaN | NaN | 11.813779 | 1024.045348 |
23660 rows × 12 columns
results_df.replace(0, np.nan, inplace=True)
results_df.groupby('province').agg(['min'])
| lag | days_to_predict | avg_mae | avg_mse | mae_tot_pos | mse_tot_pos | mae_deaths | mse_deaths | mae_recovered | mse_recovered | |
|---|---|---|---|---|---|---|---|---|---|---|
| min | min | min | min | min | min | min | min | min | min | |
| province | ||||||||||
| Agrigento | 5 | 5 | 0.311106 | 0.223745 | 0.480507 | 0.358012 | NaN | NaN | 0.451282 | 0.309612 |
| Alessandria | 5 | 5 | 0.590241 | 4.692198 | 0.863751 | 6.565015 | 0.022059 | 0.022059 | 0.846367 | 6.526906 |
| Ancona | 5 | 5 | 0.426265 | 1.470590 | 0.602994 | 1.464260 | NaN | NaN | 0.645496 | 2.392494 |
| Aosta | 5 | 5 | 0.025253 | 0.025253 | 0.045455 | 0.045455 | 0.022727 | 0.022727 | 0.028986 | 0.028986 |
| Arezzo | 5 | 5 | 0.322758 | 0.220317 | 0.513080 | 0.353638 | 0.007576 | 0.007576 | 0.440042 | 0.287650 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Vercelli | 5 | 5 | 0.391314 | 0.862030 | 0.521279 | 1.013505 | 0.030303 | 0.060606 | 0.614785 | 1.402028 |
| Verona | 5 | 5 | 0.404493 | 0.626899 | 0.576600 | 0.852278 | 0.060606 | 0.121212 | 0.576273 | 0.872536 |
| Vibo Valentia | 5 | 5 | 0.313131 | 0.232640 | 0.479336 | 0.338798 | NaN | NaN | 0.452482 | 0.337278 |
| Vicenza | 5 | 5 | 0.500357 | 1.669631 | 0.644708 | 1.881954 | 0.257576 | 1.969697 | 0.555596 | 0.557221 |
| Viterbo | 5 | 5 | 0.472222 | 1.946403 | 0.819432 | 4.176457 | 0.006410 | 0.006410 | 0.544205 | 0.805113 |
91 rows × 10 columns
results_df.to_csv("../results/sirddet_province.csv", index=False)
results_df = pd.read_csv("../results/sirddet_province.csv")
plot_df = results_df.loc[(results_df.province == "Firenze") & (results_df.days_to_predict == 14)]
plot_df.rename(columns={"lag": "Lag", "mae_tot_pos": "MAE", "days_to_predict": "Days to predict"}, inplace=True)
from IPython.display import Image
Image(plot_df.plot(x='Lag', y='MAE', backend="plotly", template="plotly_white").to_image(format="png", width=800, height=400, scale=2))
plot_df = results_df.loc[(results_df.province == "Firenze") & (results_df.lag == 7)]
plot_df.rename(columns={"lag": "Lag", "mae_tot_pos": "MAE", "days_to_predict": "Days to predict"}, inplace=True)
Image(plot_df.plot(x='Days to predict', y='MAE', backend="plotly", template="plotly_white").to_image(format="png", width=800, height=400, scale=2))
plot_df = results_df.loc[(results_df.province == "Firenze"), ["lag", "days_to_predict", "mae_tot_pos"]]
plot_df.rename(columns={"lag": "Lag", "mae_tot_pos": "MAE", "days_to_predict": "Days to predict"}, inplace=True)
import plotly.express as px
fig = px.scatter(plot_df, x="Lag", y="MAE", color='Days to predict', template='plotly_white')
Image(fig.to_image(format="png", width=600, height=400, scale=2))
# TODO: finish implementation and fix prediction dates
test_size = 7
lags = 7
prov = 'Firenze'
df = covidpro_df.loc[(covidpro_df.Province == prov)].query("20200814 > Date")
df.reset_index(inplace=True, drop=True)
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=20, test_size=test_size)
results = []
for train_index, test_index in tscv.split(df):
data_df = pd.concat([df.iloc[train_index], df.iloc[test_index]])
data_filter = df.iloc[train_index[-1]].Date.strftime("%Y%m%d")
model = DeterministicSird(
data_df=data_df,
pop_prov_df=pop_prov_df,
prov_list_df=prov_list_df,
area=prov,
group_column='Province',
data_column='Date',
data_filter=data_filter,
lag=lags,
days_to_predict=test_size,
is_regional=False,
pcm_data=dpc_regioni_df)
res = model.fit()
realdf = model.real_df
mae_tot_pos = model.mae(compart='nuovi_positivi')
mse_tot_pos = model.mse(compart='nuovi_positivi')
mae_deaths = model.mae(compart='deceduti')
mse_deaths = model.mse(compart='deceduti')
mae_rec = model.mae(compart='dimessi_guariti')
mse_rec = model.mse(compart='dimessi_guariti')
mae_avg = np.mean([mae_tot_pos, mae_deaths, mae_rec])
mse_avg = np.mean([mse_tot_pos, mse_deaths, mse_rec])
results.append([
prov,
lags,
test_size,
data_filter,
mae_avg,
mse_avg,
mae_tot_pos,
mse_tot_pos,
mae_deaths,
mse_deaths,
mae_rec,
mse_rec])
results_df = pd.DataFrame.from_records(results, columns=[
'province',
'lag',
'days_to_predict',
'train_last_date',
'avg_mae',
'avg_mse',
'mae_tot_pos',
'mse_tot_pos',
'mae_deaths',
'mse_deaths',
'mae_recovered',
'mse_recovered'])
results_df
| province | lag | days_to_predict | train_last_date | avg_mae | avg_mse | mae_tot_pos | mse_tot_pos | mae_deaths | mse_deaths | mae_recovered | mse_recovered | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Firenze | 7 | 7 | 20200326 | 4.368980 | 262.286783 | 10.078947 | 760.710526 | 1.500000 | 16.605263 | 1.527992 | 9.544561 |
| 1 | Firenze | 7 | 7 | 20200402 | 5.278754 | 389.942768 | 10.377778 | 1022.955556 | 0.733333 | 4.777778 | 4.725150 | 142.094971 |
| 2 | Firenze | 7 | 7 | 20200409 | 4.340713 | 260.151476 | 7.942308 | 646.750000 | 0.846154 | 7.692308 | 4.233677 | 126.012119 |
| 3 | Firenze | 7 | 7 | 20200416 | 6.807768 | 855.915973 | 6.288136 | 350.762712 | 0.406780 | 1.830508 | 13.728389 | 2215.154699 |
| 4 | Firenze | 7 | 7 | 20200423 | 8.337864 | 1626.566762 | 3.257576 | 138.015152 | 1.363636 | 19.151515 | 20.392379 | 4722.533619 |
| 5 | Firenze | 7 | 7 | 20200430 | 3.644492 | 282.493487 | 3.493151 | 133.931507 | 0.684932 | 8.082192 | 6.755393 | 705.466763 |
| 6 | Firenze | 7 | 7 | 20200507 | 5.547599 | 852.746344 | 2.412500 | 70.662500 | 0.325000 | 1.850000 | 13.905297 | 2485.726531 |
| 7 | Firenze | 7 | 7 | 20200514 | 1.618123 | 65.096294 | 0.712644 | 9.057471 | 0.448276 | 3.114943 | 3.693451 | 183.116467 |
| 8 | Firenze | 7 | 7 | 20200521 | 2.313553 | 235.434348 | 0.414894 | 2.648936 | 0.127660 | 0.489362 | 6.398105 | 703.164746 |
| 9 | Firenze | 7 | 7 | 20200528 | 0.637605 | 6.933256 | 0.336634 | 1.920792 | 0.118812 | 0.297030 | 1.457369 | 18.581945 |
| 10 | Firenze | 7 | 7 | 20200604 | 0.613887 | 11.222339 | 0.175926 | 0.490741 | 0.027778 | 0.027778 | 1.637959 | 33.148498 |
| 11 | Firenze | 7 | 7 | 20200611 | 0.697315 | 13.814640 | 0.165217 | 0.495652 | 0.095652 | 0.165217 | 1.831076 | 40.783050 |
| 12 | Firenze | 7 | 7 | 20200618 | 1.050186 | 220.758789 | 0.122951 | 0.270492 | 0.057377 | 0.090164 | 2.970230 | 661.915710 |
| 13 | Firenze | 7 | 7 | 20200625 | 0.196916 | 0.210930 | 0.069767 | 0.147287 | 0.000000 | 0.000000 | 0.520980 | 0.485503 |
| 14 | Firenze | 7 | 7 | 20200702 | 0.321356 | 0.980925 | 0.183824 | 1.036765 | 0.132353 | 0.397059 | 0.647892 | 1.508951 |
| 15 | Firenze | 7 | 7 | 20200709 | 0.236990 | 0.410854 | 0.125874 | 0.363636 | 0.020979 | 0.020979 | 0.564118 | 0.847947 |
| 16 | Firenze | 7 | 7 | 20200716 | 0.217220 | 0.360876 | 0.113333 | 0.633333 | 0.033333 | 0.033333 | 0.504993 | 0.415962 |
| 17 | Firenze | 7 | 7 | 20200723 | 0.225014 | 0.495389 | 0.159236 | 0.949045 | 0.006369 | 0.006369 | 0.509438 | 0.530753 |
| 18 | Firenze | 7 | 7 | 20200730 | 0.262106 | 0.675656 | 0.152439 | 0.567073 | 0.012195 | 0.012195 | 0.621684 | 1.447700 |
| 19 | Firenze | 7 | 7 | 20200806 | 0.409382 | 4.560591 | 0.157895 | 1.163743 | 0.000000 | 0.000000 | 1.070252 | 12.518031 |
results_df.avg_mae.mean()
0.9981593822500409
results_df.avg_mae.mean()
0.9981593822500409
import matplotlib.pyplot as plt
plt.hist(results_df.loc[results_df.mae_tot_pos < results_df.avg_mae.mean()].avg_mae)
plt.show()
alpha = 0.95
pl = ((1.0-alpha)/2.0) * 100
pu = (alpha+((1.0-alpha)/2.0)) * 100
lower = max(0.0, np.percentile(results_df.avg_mae, pl))
upper = min(1.0, np.percentile(results_df.avg_mae, pu))
print('%.0f%% confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))
95% confidence interval 37.0% and 100.0%
model = DeterministicSird(
data_df=dpc_regioni_df,
pop_prov_df=pop_prov_df,
prov_list_df=prov_list_df,
area="Piemonte",
group_column='denominazione_regione',
data_column='data',
data_filter="20200415",
lag=14,
days_to_predict=14)
res = model.fit()
realdf = model.real_df
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
fig, ax = plt.subplots(figsize=(8,5))
locator = mdates.AutoDateLocator(minticks=3)
formatter = mdates.ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)
ax.scatter(realdf.data, realdf['totale_positivi'].values, label='Real', s=3, color="black")
ax.plot(res.data, res['totale_positivi'].values, label='Fitted')
ax.axvline(res.iloc[-14].data, linestyle="dashed", color="grey", alpha=0.3)
plt.legend()
plt.show()
model.mae(compart="totale_positivi"), model.mse(compart="totale_positivi")
(149.26153846153846, 225841.1076923077)
fig, ax = plt.subplots(figsize=(8,5))
locator = mdates.AutoDateLocator(minticks=3)
formatter = mdates.ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)
ax.scatter(realdf.data, realdf['dimessi_guariti'].values, label='Real', s=3, color="black")
ax.plot(res.data, res['dimessi_guariti'].values, label='Fitted')
ax.axvline(res.iloc[-14].data, linestyle="dashed", color="grey", alpha=0.3)
plt.legend()
plt.ylabel("Number of individuals")
plt.title("Recovered")
plt.show()
model.mae(compart="dimessi_guariti"), model.mse(compart="dimessi_guariti")
(269.9846153846154, 524772.323076923)
fig, ax = plt.subplots(figsize=(8,5))
locator = mdates.AutoDateLocator(minticks=3)
formatter = mdates.ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)
ax.scatter(realdf.data, realdf['deceduti'].values, label='Real', s=3, color="black")
ax.plot(res.data, res['deceduti'].values, label='Fitted')
ax.axvline(res.iloc[-14].data, linestyle="dashed", color="grey", alpha=0.3)
plt.legend()
plt.ylabel("Number of individuals")
plt.title("Total deaths")
plt.show()
model.mae(compart="deceduti"), model.mse(compart="deceduti")
(159.72307692307692, 176929.72307692308)